First, we need to clean up the dataset a bit.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
# Load the listings. stringsAsFactors is set explicitly because its default
# changed from TRUE to FALSE in R 4.0.0, and the structure printed below
# shows the text columns as factors.
airbnb = read.csv("airbnb.csv", stringsAsFactors = TRUE)
str(airbnb)
## 'data.frame': 500 obs. of 63 variables:
## $ id : int 14843458 722570 8153968 18327990 12673204 10344726 5737004 16826474 5414283 8610501 ...
## $ listing_url : Factor w/ 500 levels "https://www.airbnb.com/rooms/10043106",..: 131 431 453 316 56 4 391 243 381 464 ...
## $ name : Factor w/ 500 levels "#61 King Arthurs Court",..: 177 302 191 410 119 124 151 193 179 17 ...
## $ summary : Factor w/ 492 levels "","- Townhome 150 yds (2 min walk) from McCormick Place - Free Parking - Huge Room with queen bed (and futon) - Pr"| __truncated__,..: 124 1 283 421 463 126 280 292 323 175 ...
## $ space : Factor w/ 366 levels "","- This is a great, two bedroom, one bathroom home on the 1st floor of a four unit building. The first bedroom h"| __truncated__,..: 250 78 349 1 176 107 170 6 365 51 ...
## $ description : Factor w/ 499 levels "- Townhome 150 yds (2 min walk) from McCormick Place - Free Parking - Huge Room with queen bed (and futon) - Pr"| __truncated__,..: 127 117 286 427 470 129 283 295 326 178 ...
## $ neighborhood_overview : Factor w/ 331 levels "","------------------------------------------------------------------------------ SUPER CONVENIENT LOCATION!!! ---"| __truncated__,..: 102 1 119 1 86 186 187 3 82 220 ...
## $ notes : Factor w/ 225 levels "","- Any short-term stays of 3 or lesser nights (over a weekend) may be assessed a surcharge",..: 100 1 211 1 143 92 1 3 73 1 ...
## $ transit : Factor w/ 346 levels "","- 1 min walk to the Lawrence Red Line station - 1 mile from Metra station (commuter rail) - Divvy Bike station "| __truncated__,..: 227 1 269 1 59 100 300 9 131 252 ...
## $ access : Factor w/ 315 levels "","#94 is the door code. How to receive keys will be arranged ahead of time.",..: 194 1 312 1 163 297 1 52 172 22 ...
## $ interaction : Factor w/ 313 levels "","24/7 access to the host via phone, text, or email.",..: 301 1 300 1 121 178 1 223 103 198 ...
## $ house_rules : Factor w/ 318 levels "","-- Not handicap accessible (there are 59 stairs - no elevator!)",..: 107 151 41 225 98 176 267 236 55 143 ...
## $ host_id : int 20653807 3731751 16500117 6903096 34473759 45766549 29751294 111964625 2768284 37417772 ...
## $ host_url : Factor w/ 461 levels "https://www.airbnb.com/users/show/100027760",..: 106 205 82 369 185 264 156 33 143 206 ...
## $ host_name : Factor w/ 365 levels "Aama","Aamir",..: 170 211 314 117 206 327 111 208 68 218 ...
## $ host_since : Factor w/ 413 levels "01/02/2016","01/02/2017",..: 281 319 170 181 161 323 80 15 199 205 ...
## $ host_location : Factor w/ 31 levels "Barrington, Illinois, United States",..: 16 7 7 26 7 7 7 7 7 7 ...
## $ host_about : Factor w/ 319 levels "","\"Depth and breadth are crucial to creativity.\" -Adam M Grant \n\nChicago native. Buckeye nation.",..: 314 295 122 192 173 309 1 292 34 12 ...
## $ host_response_rate : Factor w/ 32 levels "0%","100%","33%",..: 2 2 2 2 2 2 32 2 2 10 ...
## $ host_is_superhost : Factor w/ 2 levels "f","t": 1 1 2 1 2 1 1 1 2 1 ...
## $ host_neighbourhood : Factor w/ 61 levels "","Albany Park",..: 1 28 1 1 28 28 28 28 1 21 ...
## $ host_verifications : Factor w/ 82 levels "['email', 'phone', 'amex', 'reviews', 'kba', 'work_email']",..: 23 71 24 76 71 71 71 26 70 76 ...
## $ host_has_profile_pic : Factor w/ 2 levels "f","t": 2 2 2 2 2 2 2 2 2 2 ...
## $ host_identity_verified : Factor w/ 2 levels "f","t": 2 2 2 1 2 2 2 1 2 1 ...
## $ street : Factor w/ 126 levels "Albany Park, Chicago, IL 60625, United States",..: 31 66 31 31 66 67 67 66 26 56 ...
## $ neighbourhood : Factor w/ 45 levels "Albany Park",..: 20 20 20 20 20 20 20 20 14 14 ...
## $ latitude : num 42 42 42 42 42 ...
## $ longitude : num -87.7 -87.7 -87.7 -87.7 -87.7 ...
## $ is_location_exact : Factor w/ 2 levels "f","t": 1 2 1 1 2 2 2 2 1 2 ...
## $ room_type : Factor w/ 3 levels "Entire home/apt",..: 1 1 2 1 1 1 1 1 2 1 ...
## $ accommodates : int 3 4 4 2 3 3 2 6 2 4 ...
## $ bathrooms : num 1 2 1 1 1 1 1 2 1 1 ...
## $ bedrooms : int 1 2 1 1 2 0 1 3 1 1 ...
## $ beds : int 1 2 2 1 2 2 1 4 1 2 ...
## $ bed_type : Factor w/ 5 levels "Airbed","Couch",..: 5 5 5 5 5 5 5 5 5 1 ...
## $ amenities : Factor w/ 495 levels "{\"Air conditioning\",Kitchen,\"Pets allowed\",Heating,\"Family/kid friendly\",\"Smoke detector\",\"Carbon mono"| __truncated__,..: 428 151 189 12 375 92 68 155 187 436 ...
## $ price : int 69 139 65 80 150 83 85 85 59 120 ...
## $ monthly_price : int NA NA 1600 NA NA 695 NA NA 1470 NA ...
## $ security_deposit : int NA 300 NA NA NA NA NA 150 NA NA ...
## $ cleaning_fee : int 20 80 15 NA 35 NA NA 59 20 NA ...
## $ guests_included : int 1 3 2 1 1 2 1 2 2 4 ...
## $ price_extra_people : int 48 20 10 0 0 10 0 15 0 25 ...
## $ maximum_nights : int 14 21 1125 1125 1125 1125 1125 1125 31 1125 ...
## $ calendar_updated : Factor w/ 22 levels "1 week ago","12 months ago",..: 19 5 5 5 5 19 16 21 15 8 ...
## $ availability_30 : int 6 1 16 0 0 0 0 9 8 11 ...
## $ availability_60 : int 7 1 43 0 8 3 0 15 36 23 ...
## $ availability_90 : int 13 5 73 0 19 16 0 15 61 30 ...
## $ availability_365 : int 13 280 73 129 19 73 0 46 332 305 ...
## $ review_scores_cleanliness : int 9 10 10 NA 10 10 10 8 10 9 ...
## $ review_scores_communication: int 10 10 10 NA 10 10 10 10 10 10 ...
## $ review_scores_value : int 10 10 10 NA 10 10 10 10 10 9 ...
## $ instant_bookable : Factor w/ 2 levels "f","t": 1 1 1 2 1 1 1 1 1 1 ...
## $ cancellation_policy : Factor w/ 3 levels "flexible","moderate",..: 1 2 2 1 2 2 2 3 2 1 ...
## $ reviews_per_month : num 0.96 0.29 2.61 NA 0.43 2.28 0.16 6.32 3.47 1.58 ...
## $ cable_tv : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 1 2 2 1 ...
## $ wireless_internet : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 2 2 2 2 2 ...
## $ kitchen : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 2 2 2 2 2 ...
## $ pets_allowed : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 2 ...
## $ breakfast : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 2 1 ...
## $ heating : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 2 1 2 2 2 ...
## $ X24.hour_checkin : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 2 2 2 ...
## $ smoking_allowed : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ smoking_allowed.2 : int 0 0 0 0 0 0 1 0 0 0 ...
# Free-text fields, URLs, host_about and street are stored as character
# instead of factor; columns are picked by their position in the data frame.
airbnb_char = mutate_at(airbnb, c(2:12, 14, 18, 25), as.character)
# Set numerics where they're needed
library(stringr)
# Strip the percent sign, then parse as numeric; entries that are still not
# numbers after stripping become NA (hence the coercion warning).
airbnb_char$host_response_rate = airbnb_char$host_response_rate %>%
  str_remove("%") %>%
  as.numeric()
## Warning: NAs introduced by coercion
# all others should be properly formatted
# Set boolean columns
# Positions of the flag columns: the t/f fields (host_is_superhost,
# host_has_profile_pic, host_identity_verified, is_location_exact,
# instant_bookable) and the Yes/No amenity fields (cable_tv through
# smoking_allowed).
bool_cols = c(20, 23, 24, 29, 52, 55:62)
# Rewrite one flag column as "true"/"false" strings, which as.logical parses.
# "Yes" maps to "true" (it was previously mapped to "false", which turned
# every Yes/No amenity flag into FALSE). Any other value is left untouched.
recode_flag = function(x) {
  x = as.character(x) # R won't reassign unless column is of character type
  x[x %in% c("t", "Yes")] = "true"
  x[x %in% c("f", "No")] = "false"
  x
}
# Only the flag columns are recoded, so a text field that happens to equal
# "t", "f", "Yes" or "No" is no longer rewritten.
airbnb_char = airbnb_char %>% mutate_at(bool_cols, recode_flag)
airbnb_clean = airbnb_char %>% mutate_at(bool_cols, as.logical)
# boolean values are set to be TRUE and FALSE
# Set missing values to NA: every cell equal to the empty string is
# overwritten with NA across the whole data frame.
airbnb_clean[airbnb_clean==""] = NA # Note: this should work for both character and factor variables
# Setting dates to date types; host_since is parsed as month/day/4-digit-year
# text (e.g. "01/02/2016").
airbnb_clean$host_since = as.Date(airbnb_clean$host_since, format="%m/%d/%Y")
# this is done last because it breaks slicing
# NOTE(review): presumably the data-frame-wide comparisons used during
# cleaning misbehave once a Date column is present; confirm before reordering.
Now, we can start with some exploration of individual variables.
library(ggplot2)
First off, we’ll look at distributions of some simple numerical variables.
# While there are many libraries for this, we can easily whip up a quick summary statistics function
# Print a block of summary statistics for a numeric vector and return them.
#
# Args:
#   variable: a numeric (or numeric-coercible) vector. NAs are allowed and are
#     ignored by every statistic except the total count.
#
# Returns (invisibly): a named list with total, n, min, max, mean, sd, median,
#   quantiles (0/25/50/75/100%) and iqr, so callers can reuse the values
#   instead of recomputing them. The printed output is unchanged.
summary_stats = function(variable) {
  # as.numeric() on a factor would silently return the level codes.
  if (is.factor(variable)) {
    stop("summary_stats() needs numeric input, not a factor", call. = FALSE)
  }
  # Coerce once so every statistic sees the same values (previously only the
  # quantile call coerced its input).
  values = as.numeric(variable)
  stats = list(
    total = length(values),
    n = sum(!is.na(values)),
    min = min(values, na.rm = TRUE),
    max = max(values, na.rm = TRUE),
    mean = mean(values, na.rm = TRUE),
    sd = sd(values, na.rm = TRUE),
    median = median(values, na.rm = TRUE),
    quantiles = quantile(values, c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE),
    iqr = IQR(values, na.rm = TRUE)
  )
  cat("Total Count: ", stats$total, "\n")
  cat("N: ", stats$n, "\n")
  cat("Minimum: ", stats$min, "\n")
  cat("Maximum: ", stats$max, "\n")
  cat("Mean: ", stats$mean, "\n")
  cat("Standard Deviation: ", stats$sd, "\n")
  cat("Median: ", stats$median, "\n")
  cat("Quantiles:", "\n")
  print(stats$quantiles)
  cat("IQR: ", stats$iqr, "\n")
  invisible(stats)
}
Host response rate:
summary_stats(airbnb_clean$host_response_rate)
## Total Count: 500
## N: 480
## Minimum: 0
## Maximum: 100
## Mean: 96.23125
## Standard Deviation: 10.81784
## Median: 100
## Quantiles:
## 0% 25% 50% 75% 100%
## 0 100 100 100 100
## IQR: 0
Given that we have a median equal to our maximum, 100, along with such a high mean and low IQR, we clearly have a majority of hosts having a 100% response rate.
Number of guests accommodated:
summary_stats(airbnb_clean$accommodates)
## Total Count: 500
## N: 500
## Minimum: 1
## Maximum: 16
## Mean: 3.872
## Standard Deviation: 2.600219
## Median: 3
## Quantiles:
## 0% 25% 50% 75% 100%
## 1 2 3 5 16
## IQR: 3
Price:
summary_stats(airbnb_clean$price)
## Total Count: 500
## N: 500
## Minimum: 10
## Maximum: 950
## Mean: 135.416
## Standard Deviation: 125.9147
## Median: 100
## Quantiles:
## 0% 25% 50% 75% 100%
## 10 60 100 155 950
## IQR: 95
We will get four price categories based on our quantiles, which we’ll call very high, high, medium, and low, for later categorization use.
# Quartile cut points of price: 25% (low), 50% (medium), 75% (high)
low = quantile(airbnb_clean$price, probs = 0.25)
medium = quantile(airbnb_clean$price, probs = 0.5)
high = quantile(airbnb_clean$price, probs = 0.75)
# One slot per listing, filled in below; starts out as a logical vector
price_category = logical(length(airbnb_clean$price))
# Label each listing by price quartile. Before each assignment the number of
# matching listings is counted; afterwards that count is compared with the
# number of entries carrying the new label.
# price <= low  ->  "low"
lowcheck = sum(airbnb_clean$price <= low)
price_category[airbnb_clean$price <= low] = "low"
print(lowcheck == sum(price_category == "low"))
## [1] TRUE
# low < price <= medium  ->  "medium"
medcheck = sum(airbnb_clean$price > low & airbnb_clean$price <= medium)
price_category[airbnb_clean$price > low & airbnb_clean$price <= medium] = "medium"
print(medcheck == sum(price_category == "medium"))
## [1] TRUE
# medium < price <= high  ->  "high"
hicheck = sum(airbnb_clean$price > medium & airbnb_clean$price <= high)
price_category[airbnb_clean$price > medium & airbnb_clean$price <= high] = "high"
print(hicheck == sum(price_category == "high"))
## [1] TRUE
# price > high  ->  "very high"
vcheck = sum(airbnb_clean$price > high)
price_category[airbnb_clean$price > high] = "very high"
print(vcheck == sum(price_category == "very high"))
## [1] TRUE
# All checks seem right, let's see if our distribution checks out:
# Stored as a factor with explicit levels so the categories are ordered from
# low to very high rather than alphabetically.
airbnb_clean$price_category = factor(price_category, levels = c("low", "medium", "high", "very high"))
ggplot(airbnb_clean, aes(price_category)) + geom_bar() + labs(x = "Price Category", y = "Count", title = "Price Category Distributions")
# Looks reasonably even.
Maximum nights:
summary_stats(airbnb_clean$maximum_nights)
## Total Count: 500
## N: 500
## Minimum: 3
## Maximum: 1125
## Mean: 736.986
## Standard Deviation: 518.5947
## Median: 1125
## Quantiles:
## 0% 25% 50% 75% 100%
## 3 30 1125 1125 1125
## IQR: 1095
Note the high proportion of 1125s, probably the maximum allowed for a listing by the website.
Reviews per month:
summary_stats(airbnb_clean$reviews_per_month)
## Total Count: 500
## N: 426
## Minimum: 0.05
## Maximum: 9.75
## Mean: 2.135563
## Standard Deviation: 1.960553
## Median: 1.485
## Quantiles:
## 0% 25% 50% 75% 100%
## 0.050 0.620 1.485 3.245 9.750
## IQR: 2.625
This gives an interesting variety, let’s look at a histogram.
ggplot(airbnb_clean, aes(reviews_per_month)) + geom_histogram() + labs(x = "Reviews Per Month", y = "Count", title = "Reviews Per Month Distribution")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 74 rows containing non-finite values (stat_bin).
We can see a right skew with a floor at 0, so we know most locations aren’t reviewed many times per month.
Cleanliness review score (out of 10):
summary_stats(airbnb_clean$review_scores_cleanliness)
## Total Count: 500
## N: 425
## Minimum: 4
## Maximum: 10
## Mean: 9.477647
## Standard Deviation: 0.8467998
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 4 9 10 10 10
## IQR: 1
Communication review score (out of 10):
summary_stats(airbnb_clean$review_scores_communication)
## Total Count: 500
## N: 424
## Minimum: 8
## Maximum: 10
## Mean: 9.856132
## Standard Deviation: 0.4016045
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 8 10 10 10 10
## IQR: 0
Value review score (out of 10):
summary_stats(airbnb_clean$review_scores_value)
## Total Count: 500
## N: 422
## Minimum: 6
## Maximum: 10
## Mean: 9.56872
## Standard Deviation: 0.6673718
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 6 9 10 10 10
## IQR: 1
All three review categories seem to tend toward the high end, so a left skew is likely. Let’s check:
ggplot(airbnb_clean, aes(review_scores_cleanliness)) + geom_histogram() + labs(x = "Cleanliness Review Score", y = "Count", title = "Cleanliness Review Scores Distribution")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 75 rows containing non-finite values (stat_bin).
ggplot(airbnb_clean, aes(review_scores_communication)) + geom_histogram() + labs(x = "Communication Review Score", y = "Count", title = "Communication Review Scores Distribution")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 76 rows containing non-finite values (stat_bin).
ggplot(airbnb_clean, aes(review_scores_value)) + geom_histogram() + labs(x = "Value Review Score", y = "Count", title = "Value Review Scores Distribution")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 78 rows containing non-finite values (stat_bin).
Unsurprising. People tend to give high reviews in the dataset.
# Let's look at some counts for factors to gauge their plottability
# Unique host names:
length(unique(airbnb_clean$host_name))
## [1] 365
# Unique host locations:
length(unique(airbnb_clean$host_location))
## [1] 31
# Unique host neighborhoods:
length(unique(airbnb_clean$host_neighbourhood))
## [1] 61
# Unique listing neighborhoods:
length(unique(airbnb_clean$neighbourhood))
## [1] 45
# Unique room types:
length(unique(airbnb_clean$room_type))
## [1] 3
# Unique bed types:
length(unique(airbnb_clean$bed_type))
## [1] 5
We can see that some variables have more values than can comfortably be put into a bar graph, but we can easily look at distributions for room and bed types.
ggplot(airbnb_clean, aes(room_type, fill = price_category)) + geom_bar() + labs(x = "Room Type", y = "Count", title = "Distribution of Room Types")
So, we can see that most rooms are an entire home or apartment, many are private rooms, and very few are shared rooms. Note that the majority of high and very high prices appear in the entire home category, unsurprisingly. Most low prices are in the private room category. While private rooms are not the least luxurious, they are much more common than shared rooms, which have a much higher proportion of low prices within their room category. In summary, higher price means more private space.
ggplot(airbnb_clean, aes(bed_type)) + geom_bar() + labs(x = "Bed Type", y = "Count", title = "Distribution of Bed Types")
The overwhelming majority of beds are real beds, with very few out of the 500 total listings in the data set being airbeds, couches, futons, or pull-out sofas.
# How about distributions of host start times?
ggplot(airbnb_clean, aes(host_since)) + geom_histogram() + labs(x = "Start Date", y = "Count", title = "Distribution of Host Start Dates")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We have a left-skewed distribution, meaning in this context that most currently active hosts started relatively recently. That is, there has likely either been a sharp rise in hosts in the area over time, or hosts tend to host for a short amount of time before quitting, so that the active hosts are mostly those who began recently. The former is the more obvious explanation, but we would need data about inactive hosts to rule out the latter.
Let’s look at some data involving individual hosts.
# A count of unique host id's will tell us how many hosts we have
print(length(unique(airbnb_clean$host_id)))
## [1] 461
# Out of curiosity, does this match the unique host names?
print(length(unique(airbnb_clean$host_name)))
## [1] 365
So, it seems at least one host (almost certainly more) has multiple listings available, given that our dataset has 500 listings. Also, hosts share names.
What are some common names?
repeat_names = (sort(table(airbnb_clean$host_name), decreasing = TRUE))
repeat_names[repeat_names > 1]
##
## Joe Sonder John Paul Justin
## 10 9 7 7 6
## Nick Michael Chris David Jessica
## 6 5 4 4 4
## Laura Lisa Liz Mark Matt
## 4 4 4 4 4
## Mike The Flats Amanda Charles Dan
## 4 4 3 3 3
## Daniel Freehand James Jennifer Maria
## 3 3 3 3 3
## Mario Nicole Sarah Sharon Stephanie
## 3 3 3 3 3
## Tom Ami Andrew Ashley At Home Inn
## 3 2 2 2 2
## Catherine Christopher Dana Emily Frank
## 2 2 2 2 2
## Jane Jeff Jenny Jimmy Kari
## 2 2 2 2 2
## Kate Kim Kristi Leon Liliana
## 2 2 2 2 2
## Lori Mary Megan Mejai Kai Melanie & Joe
## 2 2 2 2 2
## Monica Natalia Pamela Peter Rebecca
## 2 2 2 2 2
## Ross Steve Terry Thomas Trevor
## 2 2 2 2 2
Some of these are obviously common names, e.g. Joe, John, Paul. Others seem to be a single business with multiple properties, like “At Home Inn” and “The Flats”. Similarly, a bit of research shows that our second most common host name, “Sonder”, is also such a business.
Let’s look at ratings by name
# Total rating per listing = communication + cleanliness + value
airbnb_clean = airbnb_clean %>%
  mutate(
    review_scores_total = (
      review_scores_communication +
        review_scores_cleanliness +
        review_scores_value
    )
  )
# Mean total per host name; names whose mean is NA are dropped, and the rest
# are sorted from highest to lowest.
avg_score_by_name = airbnb_clean %>%
  group_by(host_name) %>%
  summarize(avg = mean(review_scores_total)) %>%
  na.omit() %>%
  arrange(desc(avg))
head(avg_score_by_name, 15)
## # A tibble: 15 x 2
## host_name avg
## <fct> <dbl>
## 1 Adi 30
## 2 Alan 30
## 3 Alex 30
## 4 Alexander 30
## 5 Ali 30
## 6 Alissa 30
## 7 Amber 30
## 8 Amrit Rania 30
## 9 Amy 30
## 10 Andy 30
## 11 Anjli 30
## 12 Anna-Lisa 30
## 13 Anne 30
## 14 Anne-Marie 30
## 15 April 30
We can see that we have a lot of 30/30 total scores, meaning many hosts got very good ratings, as mentioned earlier. Low scores, however, stand out as unique. Let’s look at the bottom three.
tail(avg_score_by_name, 3)
## # A tibble: 3 x 2
## host_name avg
## <fct> <dbl>
## 1 Wilson 22
## 2 Niki 20
## 3 Sam 18
Since these three scores are unique, we can use them as keys for finding more info on the users. Had we used host_id, we could have used that as well, but here we used names for the sake of readability.
# Pull the listing rows for each of the three lowest-scoring host names,
# matching on both the name and its total score.
sam = airbnb_clean %>%
  filter(host_name == "Sam", review_scores_total == 18)
niki = airbnb_clean %>%
  filter(host_name == "Niki", review_scores_total == 20)
wilson = airbnb_clean %>%
  filter(host_name == "Wilson", review_scores_total == 22)
We can get a quick data frame of our three hosts of interest with host relevant data.
low_rated = rbind(sam, niki, wilson)
Let’s look at their ratings.
# Host name plus the three review score columns (positions 15 and 49-51)
low_rated[, c(
  "host_name",
  "review_scores_cleanliness",
  "review_scores_communication",
  "review_scores_value"
)]
## host_name review_scores_cleanliness review_scores_communication
## 1 Sam 4 8
## 2 Niki 6 8
## 3 Wilson 6 8
## review_scores_value
## 1 6
## 2 6
## 3 8
So, it seems that while all three hosts have good communication scores, they have average to good value scores and poor to average cleanliness scores, with cleanliness being their weakest factor in general. Let’s see how cleanliness generally compares to other review scores across all hosts.
# Per-host-name mean of each review score and of the total; names with an NA
# in any of the four means are dropped.
avg_scores_by_name = airbnb_clean %>%
  group_by(host_name) %>%
  summarize(
    avg_cl = mean(review_scores_cleanliness),
    avg_com = mean(review_scores_communication),
    avg_val = mean(review_scores_value),
    avg_total = mean(review_scores_total)
  ) %>%
  na.omit()
Now, we can look at summary stats of each rating variable
Cleanliness:
summary_stats(avg_scores_by_name$avg_cl)
## Total Count: 292
## N: 292
## Minimum: 4
## Maximum: 10
## Mean: 9.477645
## Standard Deviation: 0.8622203
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 4 9 10 10 10
## IQR: 1
Communication:
summary_stats(avg_scores_by_name$avg_com)
## Total Count: 292
## N: 292
## Minimum: 8
## Maximum: 10
## Mean: 9.860826
## Standard Deviation: 0.3800228
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 8 10 10 10 10
## IQR: 0
Value:
summary_stats(avg_scores_by_name$avg_val)
## Total Count: 292
## N: 292
## Minimum: 6
## Maximum: 10
## Mean: 9.585331
## Standard Deviation: 0.6301351
## Median: 10
## Quantiles:
## 0% 25% 50% 75% 100%
## 6 9 10 10 10
## IQR: 1
We can see that, though our base review scores don’t vary too much in mean, cleanliness has the lowest mean (~9.48), the lowest minimum (4), and the highest standard deviation (~0.862). So, there does seem to be a bit of a lower lump in the tail of our distribution (which we saw in the histograms above). Let’s see how many 4s we actually have.
# Number of listings with a cleanliness score of exactly 4 (NAs ignored)
sum(airbnb_clean$review_scores_cleanliness == 4, na.rm = TRUE)
## [1] 1
Just one it seems. Though we only have 500 listings, it seems Sam’s is particularly dirty.
We can predict some practical values using linear regression models. [Karthik]
The dataset provides useful location data for each listing that can provide some insight into different geographical areas.
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(RColorBrewer)
# Bounding box for the basemap: min longitude, min latitude, max longitude,
# max latitude (the same limits are passed to coord_sf in the map plots).
bbox = c(-87.8,41.73,-87.5,42.05)
# Download the basemap tiles covering that box at zoom level 12.
# NOTE(review): this fetches tiles from tile.stamen.com; Stamen-hosted tiles
# have reportedly moved to Stadia Maps - confirm this call still works with
# the installed ggmap version.
m = get_stamenmap(bbox,zoom=12)
## Source : http://tile.stamen.com/terrain/12/1049/1519.png
## Source : http://tile.stamen.com/terrain/12/1050/1519.png
## Source : http://tile.stamen.com/terrain/12/1051/1519.png
## Source : http://tile.stamen.com/terrain/12/1052/1519.png
## Source : http://tile.stamen.com/terrain/12/1049/1520.png
## Source : http://tile.stamen.com/terrain/12/1050/1520.png
## Source : http://tile.stamen.com/terrain/12/1051/1520.png
## Source : http://tile.stamen.com/terrain/12/1052/1520.png
## Source : http://tile.stamen.com/terrain/12/1049/1521.png
## Source : http://tile.stamen.com/terrain/12/1050/1521.png
## Source : http://tile.stamen.com/terrain/12/1051/1521.png
## Source : http://tile.stamen.com/terrain/12/1052/1521.png
## Source : http://tile.stamen.com/terrain/12/1049/1522.png
## Source : http://tile.stamen.com/terrain/12/1050/1522.png
## Source : http://tile.stamen.com/terrain/12/1051/1522.png
## Source : http://tile.stamen.com/terrain/12/1052/1522.png
## Source : http://tile.stamen.com/terrain/12/1049/1523.png
## Source : http://tile.stamen.com/terrain/12/1050/1523.png
## Source : http://tile.stamen.com/terrain/12/1051/1523.png
## Source : http://tile.stamen.com/terrain/12/1052/1523.png
## Source : http://tile.stamen.com/terrain/12/1049/1524.png
## Source : http://tile.stamen.com/terrain/12/1050/1524.png
## Source : http://tile.stamen.com/terrain/12/1051/1524.png
## Source : http://tile.stamen.com/terrain/12/1052/1524.png
# Keep only listings whose location is flagged as exact, for use in the map
# plots. which() drops rows where the condition is NA; plain logical indexing
# would turn each of those into an all-NA row.
airbnb_clean2 = airbnb_clean[which(airbnb_clean$is_location_exact == TRUE), ]
# Drop the West Elsdon neighbourhood: it has only one listing, and its
# irregularity was skewing the data/graphs.
airbnb_clean2 = airbnb_clean2[which(airbnb_clean2$neighbourhood != "West Elsdon"), ]
# Listing locations on the basemap, coloured by price category
ggmap(m) +
  geom_point(
    data = airbnb_clean2,
    mapping = aes(x = longitude, y = latitude, color = price_category),
    size = 0.9,
    alpha = 0.75
  ) +
  coord_sf(xlim = bbox[c(1, 3)], ylim = bbox[c(2, 4)], expand = FALSE) +
  labs(
    title = "Price Categories Mapped",
    x = "Longitude",
    y = "Latitude",
    color = "Price Category"
  ) +
  scale_color_brewer(palette = "Dark2")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning: Removed 2 rows containing missing values (geom_point).
This plot shows us that the listings outside of the city center are usually cheaper than the listings inside it, due to the high presence of orange and green dots on the outskirts of the graph, with more purple and blue as you get closer to the Loop, the heart of the city. There also seems to be a large number of purple points in the northern part of the city, towards Lake View and Lincoln Park (the area known as “Wrigleyville”).
# Mean price per neighbourhood, bars ordered from highest to lowest mean
# price, with the rounded mean printed on each bar
ggplot(airbnb_clean2, aes(x = reorder(neighbourhood, -price), y = price)) +
  geom_bar(
    stat = "summary",
    fun.y = "mean",
    position = "dodge",
    fill = "Light Blue",
    color = "Red"
  ) +
  stat_summary(
    aes(label = round(..y.., 2)),
    fun.y = mean,
    geom = "text",
    size = 3,
    angle = 90,
    hjust = -0.1
  ) +
  scale_y_continuous(breaks = seq(0, max(airbnb_clean$price), by = 20)) +
  labs(
    title = "Neighbourhoods By Mean Price",
    x = "Neighbourhood",
    y = "Mean Price ($)"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2))
#the labels on each bar were made using code similar to the one from user agstudy on stack over flow here:
#https://stackoverflow.com/questions/20139978/ggplot2-label-values-of-barplot-that-uses-fun-y-mean-of-stat-summary
This graph tells us that our assumptions from the map were mostly true, as the neighborhoods that are closer to the center of the city (the Loop, Near North Side) are more expensive on average, and some northern neighborhoods (Lake View, North Center, Lincoln Park) are also towards the top of this graph.
# Listing locations on the basemap, coloured by room type
ggmap(m) +
  geom_point(
    data = airbnb_clean2,
    mapping = aes(x = longitude, y = latitude, color = room_type),
    size = 0.9,
    alpha = 0.75
  ) +
  coord_sf(xlim = bbox[c(1, 3)], ylim = bbox[c(2, 4)], expand = FALSE) +
  labs(
    title = "Room Types Mapped",
    x = "Longitude",
    y = "Latitude",
    color = "Room Type"
  ) +
  scale_color_brewer(palette = "Dark2")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning: Removed 2 rows containing missing values (geom_point).
From this graph, we can assume that there are more entire units listed in the middle of the city due to the overwhelming green. This may be because the apartments in the city are smaller, and it would be harder for a guest to share that room with somebody else. We can see more orange on the northern and western parts of the city, as these are typically where single family homes are built.
# Share of each room type per neighbourhood (bars scaled to 100%),
# neighbourhoods ordered from highest to lowest mean price
ggplot(
  airbnb_clean2,
  aes(x = reorder(neighbourhood, -price), fill = room_type)
) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Neighbourhood Room Types Ordered by Mean Price Descending",
    x = "Neighbourhood",
    y = "Room Type %",
    fill = "Room Type"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2))
This graph tells us some valuable information about the listings in the city. As the prices go up, the odds of the listing being an entire apartment or house typically go up as well. This information could be useful to a potential lister in Chicago: if they want to charge more for their listing, they may have to rent out their whole unit.
# Listing locations on the basemap, coloured by number of bedrooms
# (treated as a discrete category)
ggmap(m) +
  geom_point(
    data = airbnb_clean2,
    mapping = aes(x = longitude, y = latitude, color = factor(bedrooms)),
    size = 0.9,
    alpha = 0.75
  ) +
  coord_sf(xlim = bbox[c(1, 3)], ylim = bbox[c(2, 4)], expand = FALSE) +
  labs(
    title = "# of Bedrooms Mapped",
    x = "Longitude",
    y = "Latitude",
    color = "Bedrooms"
  ) +
  scale_color_brewer(palette = "Set1")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning: Removed 3 rows containing missing values (geom_point).
From this plot we can see that it is rare for a listing in Chicago to have more than one or two bedrooms, based on the fact that there are a large number of blue, red and green points in the middle of the city. The presence of red points tells us that there are more studio apartments in the area, which would make sense considering the red points on this graph are right near the Magnificent Mile, one of the most sought-after, and expensive, areas in the city.
# Mean number of bedrooms per neighbourhood, neighbourhoods ordered from
# highest to lowest mean price, with the rounded mean printed above each bar
ggplot(airbnb_clean2, aes(x = reorder(neighbourhood, -price), y = bedrooms)) +
  geom_bar(
    stat = "summary",
    fun.y = "mean",
    position = "dodge",
    fill = "Light Blue",
    color = "Red"
  ) +
  stat_summary(
    aes(label = round(..y.., 2)),
    fun.y = mean,
    geom = "text",
    size = 2,
    vjust = -0.5
  ) +
  labs(
    title = "Mean Bedrooms per Neighbourhood Sorted by Mean Price Descending",
    x = "Neighbourhood",
    y = "Mean Bedrooms"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2))
## Warning: Removed 1 rows containing non-finite values (stat_summary).
## Warning: Removed 1 rows containing non-finite values (stat_summary).
This bar chart tells us that most listings in the city lie between one and two bedrooms. This graph also seems to show less of a relationship than the room type bar graph, as it does not seem to flow in a certain direction like the other one did.
# Listing locations on the basemap, coloured by how many guests each listing
# accommodates (treated as a discrete category)
ggmap(m) +
  geom_point(
    data = airbnb_clean2,
    mapping = aes(x = longitude, y = latitude, color = factor(accommodates)),
    size = 0.9,
    alpha = 0.75
  ) +
  coord_sf(xlim = bbox[c(1, 3)], ylim = bbox[c(2, 4)], expand = FALSE) +
  labs(
    title = "Accomodations for # of People",
    x = "Longitude",
    y = "Latitude",
    color = "# of People"
  )
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
## Warning: Removed 2 rows containing missing values (geom_point).
Here we can see that the typical listing in the middle of Chicago is not meant to accommodate more than ~5 people, given that there are mainly orange, gold and gold/green points in the heart of the city. This tracks with our previous graph of bedrooms, as fewer bedrooms would mean fewer guests could stay in that listing.
# Mean guest capacity per neighbourhood, neighbourhoods ordered from highest
# to lowest mean price, with the rounded mean printed on each bar
ggplot(
  airbnb_clean2,
  aes(x = reorder(neighbourhood, -price), y = accommodates)
) +
  geom_bar(
    stat = "summary",
    fun.y = "mean",
    position = "dodge",
    fill = "Light Blue",
    color = "Red"
  ) +
  stat_summary(
    aes(label = round(..y.., 2)),
    fun.y = mean,
    geom = "text",
    size = 3,
    hjust = -.11,
    angle = 90
  ) +
  scale_y_continuous(
    breaks = seq(0, max(airbnb_clean$accommodates), by = 1)
  ) +
  labs(
    title = "Mean # of Guest Accommodations per Neighbourhood Ordered by Mean Price",
    x = "Neighbourhood",
    y = "Mean Accommodations"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2))
Here, we can see that neighbourhoods with higher priced listings do have somewhat larger accommodation sizes than other, less expensive neighbourhoods. This graph also tells us that most listings fit around 3-4 people, which does make sense logically, as most apartments or units typically have a 2 person bed and a couch. Many AirBnB hosts make sure that couch is a sleeper sofa, which would account for 1-2 more people.
Several of the variables here come in the form of long text descriptions from which useful information can be pulled with some effort. [Kaleb]